{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "import os\n",
    "import codecs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "seed = 1024\n",
    "np.random.seed(seed)\n",
    "\n",
    "path = '../data/'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_corpus(corpus_path,corpus_file):\n",
    "    corpus = []\n",
    "    for f in corpus_file[0:]:\n",
    "        try:\n",
    "            with codecs.open(os.path.join(corpus_path+'/'+f), encoding='gbk') as f:\n",
    "                corpus.append(f.read())\n",
    "        except:\n",
    "            continue\n",
    "    return corpus\n",
    "\n",
    "def get_all_data(filename):\n",
    "    pos_path = path+filename+'/pos'\n",
    "    neg_path = path+filename+'/neg'\n",
    "\n",
    "    pos_files = os.listdir(pos_path)\n",
    "    neg_files = os.listdir(neg_path)\n",
    "\n",
    "    pos_corpus_1 = get_corpus(pos_path,pos_files)\n",
    "    neg_corpus_1 = get_corpus(neg_path,neg_files)\n",
    "\n",
    "    return pos_corpus_1,neg_corpus_1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def generate_dataset(corpus,mode='pos',filed='movie',lan='ch'):\n",
    "    subset = pd.DataFrame()\n",
    "    if mode=='pos':\n",
    "        subset['context'] = corpus\n",
    "        subset['field'] = filed\n",
    "        subset['label'] = 1\n",
    "        subset['lan'] = lan\n",
    "    else:\n",
    "        subset['context'] = corpus\n",
    "        subset['field'] = filed\n",
    "        subset['label'] = 0\n",
    "        subset['lan'] = lan\n",
    "    return subset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#read all corpus\n",
    "pos_corpus_1,neg_corpus_1 =get_all_data('corpus_1')\n",
    "pos_corpus_2,neg_corpus_2 = get_all_data('corpus_2')\n",
    "imdb = pd.read_csv(path+\"/corpus_3/labeledTrainData.tsv\", header=0, delimiter=\"\\t\", quoting=3)\n",
    "imdb['field'] = 'movie'\n",
    "imdb['lan'] = 'en'\n",
    "imdb = imdb[['review','sentiment','field','lan']]\n",
    "imdb.columns = ['context','label','field','lan']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "dataset = pd.DataFrame()\n",
    "\n",
    "s1 = generate_dataset(pos_corpus_1,mode='pos',filed='hotel',lan='ch')\n",
    "s2 = generate_dataset(neg_corpus_1,mode='neg',filed='hotel',lan='ch')\n",
    "s3 = generate_dataset(pos_corpus_2,mode='pos',filed='movie',lan='en')\n",
    "s4 = generate_dataset(neg_corpus_2,mode='neg',filed='movie',lan='en')\n",
    "\n",
    "dataset = pd.concat([dataset,s1])\n",
    "dataset = pd.concat([dataset,s2])\n",
    "dataset = pd.concat([dataset,s3])\n",
    "dataset = pd.concat([dataset,s4])\n",
    "dataset = pd.concat([dataset,imdb])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>context</th>\n",
       "      <th>field</th>\n",
       "      <th>label</th>\n",
       "      <th>lan</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!\\r\\r\\n\\r\\n\\r\\n</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。\\r\\r\\n房间本身很...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>宾馆在小街道上，不大好找，但还好北京热心同胞很多~\\r\\r\\n宾馆设施跟介绍的差不多，房间很...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风\\r\\r\\n\\r\\n...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>总的来说，这样的酒店配这样的价格还算可以，希望他赶快装修，给我的客人留些好的印象\\r\\r\\n...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>价格比比较不错的酒店。这次免费升级了，感谢前台服务员。房子还好，地毯是新的，比上次的好些。早...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>不错，在同等档次酒店中应该是值得推荐的！\\r\\r\\n\\r\\n\\r\\n</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>入住丽晶，感觉很好。因为是新酒店，的确有淡淡的油漆味， 房间内较新。房间大小合适，卫生间设备...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>1。酒店比较新，装潢和设施还不错，只是房间有些油漆味。\\r\\r\\n2。早餐还可以，只是品种不...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>我住的是特色标间，所谓特色，是有些类似家的感觉。寝具不是单调的白色，是条纹和大格子的，感觉很...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>早餐很丰富，服务也热情，早上很早退房时，前台值此人员办理手续也非常快．\\r\\r\\n\\r\\n\\r\\n</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>这次是308的行政大床，总体感觉非常不错，就是价格稍许高了点，旁边有个五星的豪华客房才398...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>好像跟洪崖洞酒店一个公司的，所以房间的风格比较接近，就是小一些，整体来说比较不错，房间内食品...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>前台　楼层服务员都不错，房间安静整洁，交通方便，吃的周围也挺多．唯一不足，卫生间地漏设计不好...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>这个宾馆总体感觉还不错，无论从价格、出行、购物，还是订机票（国航就在它的右边）都很方便。\\r...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>价格偏高,服务一般.窗外风景ok.早餐还不错\\r\\r\\n\\r\\n\\r\\n</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>酒店正在申定五星中，不过目前看来四星都有点勉强。\\r\\r\\n大堂很气派，不过细节很粗糙。硬件...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>闹中取静的一个地方，在窗前能看到不错的风景。酒店价格的确有些偏高\\r\\r\\n\\r\\n\\r\\n</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>价格偏高,好象连云港这地方的酒店都偏贵.早饭不好.房间还不错,窗外风景还行.最重要是房间的窗...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>知道网线接口在哪儿吗？比高家庄的地道口还隐蔽。在床头柜后面！想不道吧？看你怎么用。1：自带4...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>沈阳市政府的酒店，比较大气，交通便利，出门往左就是北陵公园，环境好。\\r\\r\\n\\r\\n\\r\\n</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>政府酒店感觉很气派,而且很干净,整个酒店的房间布局也很整齐.5月份入住的,由于第一天房间不能...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>总体还可以,就是前台服务员还不够敬业,在登记入住后,看都没看是不是本人,就把我客人的护照给了...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>6月下旬去丹东，在携程网上预定了丹东国际酒店2天的住宿。24日傍晚抵丹东，便打车去酒店。丹东...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>这次去北京，是要去北师大办事，所以特意留意了下附近的宾馆。住了两天，首先该宾馆很好找，离西四...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>房间设备太破,连喷头都是不好用,空调几乎感觉不到,虽然我开了最大\\r\\r\\n另外就是设备维修...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>总体感觉不错，美中不足的是房与房之间隔音效果不太好，还有房价感觉稍高。\\r\\r\\n\\r\\n\\r\\n</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>设施虽稍陈旧一些,但良好的服务给人温暖.酒店所处地理位置和环境极好.对年龄较大的个人旅游者而...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>房间比较干净，卫生间太小，转身都费劲。地理位置一半，因在胡同里，不容易找到。服务一般。\\r\\...</td>\n",
       "      <td>hotel</td>\n",
       "      <td>1</td>\n",
       "      <td>ch</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24970</th>\n",
       "      <td>\"Red Rock West (1993)&lt;br /&gt;&lt;br /&gt;Nicolas Cage ...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24971</th>\n",
       "      <td>\"what can i say?, ms Erika Eleniak is my favor...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24972</th>\n",
       "      <td>\"The spoiler warning is for those people who w...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24973</th>\n",
       "      <td>\"What do you call a horror story without horro...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24974</th>\n",
       "      <td>\"Though not a horror film in the traditional s...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24975</th>\n",
       "      <td>\"This was what black society was like before t...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24976</th>\n",
       "      <td>\"They probably should have called this movie T...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24977</th>\n",
       "      <td>\"Attractive Marjorie(Farrah Fawcett)lives in f...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24978</th>\n",
       "      <td>\"Vaguely reminiscent of great 1940's westerns,...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24979</th>\n",
       "      <td>\"I admit I had no idea what to expect before v...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24980</th>\n",
       "      <td>\"To me, the final scene, in which Harris respo...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24981</th>\n",
       "      <td>\"This is by far the funniest short made by the...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24982</th>\n",
       "      <td>\"To be a Buster Keaton fan is to have your hea...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24983</th>\n",
       "      <td>\"I was one of those \\\"few Americans\\\" that gre...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24984</th>\n",
       "      <td>\"Visually disjointed and full of itself, the d...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24985</th>\n",
       "      <td>\"this movie had more holes than a piece of swi...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24986</th>\n",
       "      <td>\"Last November, I had a chance to see this fil...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24987</th>\n",
       "      <td>\"First off, I'd like to make a correction on a...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24988</th>\n",
       "      <td>\"While originally reluctant to jump on the ban...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24989</th>\n",
       "      <td>\"I heard about this movie when watching VH1's ...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24990</th>\n",
       "      <td>\"I've never been huge on IMAX films. They're c...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24991</th>\n",
       "      <td>\"Steve McQueen has certainly a lot of loyal fa...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24992</th>\n",
       "      <td>\"Sometimes you wonder how some people get fund...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24993</th>\n",
       "      <td>\"I am a student of film, and have been for sev...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24994</th>\n",
       "      <td>\"Unimaginably stupid, redundant and humiliatin...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24995</th>\n",
       "      <td>\"It seems like more consideration has gone int...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24996</th>\n",
       "      <td>\"I don't believe they made this film. Complete...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24997</th>\n",
       "      <td>\"Guy is a loser. Can't get girls, needs to bui...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24998</th>\n",
       "      <td>\"This 30 minute documentary Buñuel made in the...</td>\n",
       "      <td>movie</td>\n",
       "      <td>0</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24999</th>\n",
       "      <td>\"I saw this movie as a child and it broke my h...</td>\n",
       "      <td>movie</td>\n",
       "      <td>1</td>\n",
       "      <td>en</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>30920 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                 context  field  label lan\n",
       "0      距离川沙公路较近,但是公交指示不对,如果是\"蔡陆线\"的话,会非常麻烦.建议用别的路线.房间较...  hotel      1  ch\n",
       "1             商务大床房，房间很大，床有2M宽，整体感觉经济实惠不错!\\r\\r\\n\\r\\n\\r\\n  hotel      1  ch\n",
       "2      早餐太差，无论去多少人，那边也不加食品的。酒店应该重视一下这个问题了。\\r\\r\\n房间本身很...  hotel      1  ch\n",
       "3      宾馆在小街道上，不大好找，但还好北京热心同胞很多~\\r\\r\\n宾馆设施跟介绍的差不多，房间很...  hotel      1  ch\n",
       "4      CBD中心,周围没什么店铺,说5星有点勉强.不知道为什么卫生间没有电吹风\\r\\r\\n\\r\\n...  hotel      1  ch\n",
       "5      总的来说，这样的酒店配这样的价格还算可以，希望他赶快装修，给我的客人留些好的印象\\r\\r\\n...  hotel      1  ch\n",
       "6      价格比比较不错的酒店。这次免费升级了，感谢前台服务员。房子还好，地毯是新的，比上次的好些。早...  hotel      1  ch\n",
       "7                     不错，在同等档次酒店中应该是值得推荐的！\\r\\r\\n\\r\\n\\r\\n  hotel      1  ch\n",
       "8      入住丽晶，感觉很好。因为是新酒店，的确有淡淡的油漆味， 房间内较新。房间大小合适，卫生间设备...  hotel      1  ch\n",
       "9      1。酒店比较新，装潢和设施还不错，只是房间有些油漆味。\\r\\r\\n2。早餐还可以，只是品种不...  hotel      1  ch\n",
       "10     我住的是特色标间，所谓特色，是有些类似家的感觉。寝具不是单调的白色，是条纹和大格子的，感觉很...  hotel      1  ch\n",
       "11     早餐很丰富，服务也热情，早上很早退房时，前台值此人员办理手续也非常快．\\r\\r\\n\\r\\n\\r\\n  hotel      1  ch\n",
       "12     这次是308的行政大床，总体感觉非常不错，就是价格稍许高了点，旁边有个五星的豪华客房才398...  hotel      1  ch\n",
       "13     好像跟洪崖洞酒店一个公司的，所以房间的风格比较接近，就是小一些，整体来说比较不错，房间内食品...  hotel      1  ch\n",
       "14     前台　楼层服务员都不错，房间安静整洁，交通方便，吃的周围也挺多．唯一不足，卫生间地漏设计不好...  hotel      1  ch\n",
       "15     这个宾馆总体感觉还不错，无论从价格、出行、购物，还是订机票（国航就在它的右边）都很方便。\\r...  hotel      1  ch\n",
       "16                  价格偏高,服务一般.窗外风景ok.早餐还不错\\r\\r\\n\\r\\n\\r\\n  hotel      1  ch\n",
       "17     酒店正在申定五星中，不过目前看来四星都有点勉强。\\r\\r\\n大堂很气派，不过细节很粗糙。硬件...  hotel      1  ch\n",
       "18        闹中取静的一个地方，在窗前能看到不错的风景。酒店价格的确有些偏高\\r\\r\\n\\r\\n\\r\\n  hotel      1  ch\n",
       "19     价格偏高,好象连云港这地方的酒店都偏贵.早饭不好.房间还不错,窗外风景还行.最重要是房间的窗...  hotel      1  ch\n",
       "20     知道网线接口在哪儿吗？比高家庄的地道口还隐蔽。在床头柜后面！想不道吧？看你怎么用。1：自带4...  hotel      1  ch\n",
       "21      沈阳市政府的酒店，比较大气，交通便利，出门往左就是北陵公园，环境好。\\r\\r\\n\\r\\n\\r\\n  hotel      1  ch\n",
       "22     政府酒店感觉很气派,而且很干净,整个酒店的房间布局也很整齐.5月份入住的,由于第一天房间不能...  hotel      1  ch\n",
       "23     总体还可以,就是前台服务员还不够敬业,在登记入住后,看都没看是不是本人,就把我客人的护照给了...  hotel      1  ch\n",
       "24     6月下旬去丹东，在携程网上预定了丹东国际酒店2天的住宿。24日傍晚抵丹东，便打车去酒店。丹东...  hotel      1  ch\n",
       "25     这次去北京，是要去北师大办事，所以特意留意了下附近的宾馆。住了两天，首先该宾馆很好找，离西四...  hotel      1  ch\n",
       "26     房间设备太破,连喷头都是不好用,空调几乎感觉不到,虽然我开了最大\\r\\r\\n另外就是设备维修...  hotel      1  ch\n",
       "27     总体感觉不错，美中不足的是房与房之间隔音效果不太好，还有房价感觉稍高。\\r\\r\\n\\r\\n\\r\\n  hotel      1  ch\n",
       "28     设施虽稍陈旧一些,但良好的服务给人温暖.酒店所处地理位置和环境极好.对年龄较大的个人旅游者而...  hotel      1  ch\n",
       "29     房间比较干净，卫生间太小，转身都费劲。地理位置一半，因在胡同里，不容易找到。服务一般。\\r\\...  hotel      1  ch\n",
       "...                                                  ...    ...    ...  ..\n",
       "24970  \"Red Rock West (1993)<br /><br />Nicolas Cage ...  movie      1  en\n",
       "24971  \"what can i say?, ms Erika Eleniak is my favor...  movie      1  en\n",
       "24972  \"The spoiler warning is for those people who w...  movie      1  en\n",
       "24973  \"What do you call a horror story without horro...  movie      0  en\n",
       "24974  \"Though not a horror film in the traditional s...  movie      1  en\n",
       "24975  \"This was what black society was like before t...  movie      1  en\n",
       "24976  \"They probably should have called this movie T...  movie      0  en\n",
       "24977  \"Attractive Marjorie(Farrah Fawcett)lives in f...  movie      1  en\n",
       "24978  \"Vaguely reminiscent of great 1940's westerns,...  movie      1  en\n",
       "24979  \"I admit I had no idea what to expect before v...  movie      1  en\n",
       "24980  \"To me, the final scene, in which Harris respo...  movie      1  en\n",
       "24981  \"This is by far the funniest short made by the...  movie      1  en\n",
       "24982  \"To be a Buster Keaton fan is to have your hea...  movie      0  en\n",
       "24983  \"I was one of those \\\"few Americans\\\" that gre...  movie      0  en\n",
       "24984  \"Visually disjointed and full of itself, the d...  movie      0  en\n",
       "24985  \"this movie had more holes than a piece of swi...  movie      0  en\n",
       "24986  \"Last November, I had a chance to see this fil...  movie      1  en\n",
       "24987  \"First off, I'd like to make a correction on a...  movie      1  en\n",
       "24988  \"While originally reluctant to jump on the ban...  movie      1  en\n",
       "24989  \"I heard about this movie when watching VH1's ...  movie      1  en\n",
       "24990  \"I've never been huge on IMAX films. They're c...  movie      1  en\n",
       "24991  \"Steve McQueen has certainly a lot of loyal fa...  movie      0  en\n",
       "24992  \"Sometimes you wonder how some people get fund...  movie      0  en\n",
       "24993  \"I am a student of film, and have been for sev...  movie      0  en\n",
       "24994  \"Unimaginably stupid, redundant and humiliatin...  movie      0  en\n",
       "24995  \"It seems like more consideration has gone int...  movie      0  en\n",
       "24996  \"I don't believe they made this film. Complete...  movie      0  en\n",
       "24997  \"Guy is a loser. Can't get girls, needs to bui...  movie      0  en\n",
       "24998  \"This 30 minute documentary Buñuel made in the...  movie      0  en\n",
       "24999  \"I saw this movie as a child and it broke my h...  movie      1  en\n",
       "\n",
       "[30920 rows x 4 columns]"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pd.to_pickle(dataset,path+'corpus.pkl')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
